#!/usr/bin/env Rscript


## Identifies the approximate number of most relevant transcripts based on finding the point representing the elbow in the curve of expression ~ ordered transcript 
    

main = function () {
    
            
    suppressPackageStartupMessages(library("argparse"))
    suppressPackageStartupMessages(library("tidyverse"))


    
    parser = ArgumentParser()
    parser$add_argument("--E_inputs", help="file.isoform.TMM.EXPR.matrix.E-inputs", required=TRUE, nargs=1)
    parser$add_argument("--out_pdf", help="name for output pdf filename", required=FALSE, nargs=1, default="estimate_TPM_threshold.pdf")
    args = parser$parse_args()
    	
    E_inputs_filename = args$E_inputs
    output_pdf_filename = args$out_pdf

	message("-parsing:", E_inputs_filename)

    if (grepl(".gz$", E_inputs_filename)) {
		data = read.table(gzfile(E_inputs_filename), header=T, com='', sep="\t", stringsAsFactors = F)
	} else {
		data = read.table(E_inputs_filename, header=T, com='', sep="\t", stringsAsFactors = F)
    }

    orig_data = data
    
	data = data %>% filter(max_expr_over_samples > 0.05 & max_expr_over_samples <= 2^5) # must have some evidence of expression


    if (nrow(data) < 1000) {
    	message("Too few data points in the low-expression range to perform a meaningful analysis here.")
    	quit(save = "no", status = 0, runLast = FALSE)
	}
    
	data = data %>% arrange(desc(max_expr_over_samples)) %>% mutate(r=row_number())

	data = data %>% mutate(logexpr = log2(max_expr_over_samples+1)) 

    normalize = function(x) {
  
		x = unlist(x)
  		min_x = min(x)
 		max_x = max(x)
  
    
  		vals =  (x-min_x)/(max_x-min_x)
  		return(vals)
	}


	data$x = normalize(list(data$logexpr))
	data$y = normalize(list(data$r))
    


	message("-performing elbow analysis to select threshold.")
	get_dist_point_line <- function(point,
                                line_coord1,
                                line_coord2) {
		# derived from https://github.com/ropensci/pathviewr/blob/HEAD/R/analytical_functions.R
    	## Compute
    	v1 <- line_coord1 - line_coord2
    	v2 <- point - line_coord1
    	m <- cbind(v1, v2)
    	dist <- abs(det(m)) / sqrt(sum(v1 * v1))
  
  		## export
  		return(dist)
	}


    # get a line from the first and last point of the elbow curve
    data = data %>% arrange(x)
	first_pt = data %>% head(n=1)
	last_pt = data %>% tail(n=1)
	xs = c(first_pt$x, last_pt$x)
	ys = c(first_pt$y, last_pt$y)
	line = lm(ys ~ xs)
	coeffs = coefficients(line)
	intercept = coeffs[1]
	slope = coeffs[2]

	dist_to_line_df = do.call(rbind, apply(data.frame(x=data$x, y=data$y), 1, function(row) { 

	 	x = row[1]
 	 	y = row[2]
  
  		point_dist = get_dist_point_line(c(x,y), c(first_pt$x, first_pt$y), c(last_pt$x, last_pt$y))
  
  		return(data.frame(dist=point_dist))
  
    }) )

	data = bind_cols(data, dist_to_line_df)

    # define threshold as that transcript with greatest distance to the line (elbow transcript)
	threshold_entry = data %>% arrange(desc(dist)) %>% head(n=1) 

    tpm_x = threshold_entry$max_expr_over_samples



	# compute some stats based on that threshold
    filtered_data = orig_data %>% filter(max_expr_over_samples >= tpm_x) %>% arrange(desc(length))
    sum_length = sum(filtered_data$length)
	filtered_data = filtered_data %>% mutate(cumsum_len = cumsum(length))
	half_length = sum_length / 2
	N50_entry = filtered_data %>% filter(cumsum_len <= half_length) %>% filter(row_number() == n())
	Ex = threshold_entry %>% pull(X.Ex)
	N50 = N50_entry %>% pull(length)
	logexpr_x = threshold_entry$logexpr

    # ensure line is above the elbow
    line_y = threshold_entry$x * slope + intercept
    

    pdf(output_pdf_filename)
    p = data %>% ggplot(aes(x=logexpr, y=r)) + geom_point()
    
    if (line_y < threshold_entry$y) {
    	message("Not finding a lower elbow curve, cannot estimate min threshold")
    	
    }
    else {
    
		num_transcripts =  data %>% filter(max_expr_over_samples >= tpm_x) %>% nrow()

   		message("-Number of transcripts >= tpm threshold: ", num_transcripts)
		message("-Fraction of expression data represented: ", Ex)
    	message("-Contig N50 based on these transcripts: ", N50)
    	message("-selected TPM threshold: ", tpm_x)
     	
  		p = p + geom_vline(xintercept=logexpr_x, color='red') + 
  		annotate("text", x=threshold_entry$logexpr, y=num_transcripts, 
           label=paste0("    # transcripts =", num_transcripts, " Ex=", Ex, " N50=", N50, " TPMthresh=", tpm_x), hjust=0)

    }

   	plot(p)

	    
	message("-Done. See pdf: estimate_TPM_threshold.pdf")
        
    quit(save = "no", status = 0, runLast = FALSE)

}


if (length(sys.calls())==0) {
    main()
}
